import jieba
import pandas as pd

from __00__config import Config


def process_data(datapath, processed_datapath, max_words=75):
	"""Tokenize the ``questions`` column of a CSV file and save the result.

	Reads a comma-separated UTF-8 file, segments every entry of its
	``questions`` column with ``jieba.lcut``, keeps at most ``max_words``
	tokens per entry and stores them, joined by single spaces, in a new
	``words`` column. The whole frame is then written to
	``processed_datapath`` as a tab-separated UTF-8 file without the index.

	Args:
		datapath: Path of the input CSV file; it must contain a
			``questions`` column.
		processed_datapath: Path the tab-separated output is written to.
		max_words: Maximum number of tokens kept per question. ``None``
			keeps every token.

	Raises:
		KeyError: If the input file has no ``questions`` column.
	"""
	df = pd.read_csv(datapath, sep=',', encoding='utf-8')

	def _tokenize(text):
		# Empty CSV cells are read by pandas as NaN (a float), which jieba
		# cannot segment; treat them as empty questions instead of crashing.
		if pd.isna(text):
			return ''
		return ' '.join(jieba.lcut(str(text))[:max_words])

	# Segment the questions column into space-separated tokens.
	df['words'] = df['questions'].apply(_tokenize)

	# Token-count distribution noted by the original author for the segmented
	# samples (presumably measured before the cutoff was applied -- TODO
	# confirm). The 75th percentile is 58 tokens, so the default cutoff of 75
	# leaves most samples untruncated.
	#   count    106609.000000
	#   mean         50.245101
	#   std          18.050553
	#   min           3.000000
	#   25%          39.000000
	#   50%          48.000000
	#   75%          58.000000
	#   max         188.000000
	df.to_csv(processed_datapath, sep='\t', encoding='utf-8', index=False)
	print(f'文件保存成功，路径为：{processed_datapath}')


if __name__ == '__main__':
	# Preprocess every dataset split, in the order train -> test -> dev.
	config = Config()
	for split in ('train', 'test', 'dev'):
		# Paths are looked up right before each call, one split at a time.
		source_path = getattr(config, f'{split}_datapath')
		target_path = getattr(config, f'process_{split}_datapath')
		process_data(source_path, target_path)
